In [1]:
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score, davies_bouldin_score
In [2]:
df = pd.read_csv("heart_clean.csv")
df
Out[2]:
index age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target age_bin
0 0 52 1 0 125 212.0 0 1 168 0 1.0 2 2 3 0 (50, 60]
1 1 53 1 0 140 203.0 1 0 155 1 3.1 0 0 3 0 (50, 60]
2 2 70 1 0 145 174.0 0 1 125 1 2.6 0 0 3 0 (60, 70]
3 3 61 1 0 148 203.0 0 1 161 0 0.0 2 1 3 0 (60, 70]
4 4 62 0 0 138 294.0 1 1 106 0 1.9 1 3 2 0 (60, 70]
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 723 68 0 2 120 211.0 0 0 115 0 1.5 1 0 2 1 (60, 70]
298 733 44 0 2 108 141.0 0 1 175 0 0.6 1 0 2 1 (40, 50]
299 739 52 1 0 128 255.0 0 1 161 1 0.0 2 1 3 0 (50, 60]
300 843 59 1 3 160 273.0 0 0 125 0 0.0 2 0 2 0 (50, 60]
301 878 54 1 0 120 188.0 0 1 113 0 1.4 1 1 3 0 (50, 60]

302 rows × 16 columns

In [3]:
df.drop(columns=["age_bin"], inplace=True)
In [4]:
f"Usia Minimal = {df['age'].min()}, Usia Maksimal = {df['age'].max()}"
Out[4]:
'Usia Minimal = 29, Usia Maksimal = 77'
In [5]:
df.groupby(["target"])["target"].count().reset_index(name='total_target')
Out[5]:
target total_target
0 0 138
1 1 164
In [6]:
# Define age bins and labels
age_bins = range(20, 90, 10)  # 0-9, 10-19, ..., 90-100
age_labels = [f"{i}-{i+9}" for i in age_bins[:-1]]
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)
In [7]:
age_target_avg = df.groupby(["age_group","target"])["target"].count().reset_index(name='total_age_category')
age_target_avg = age_target_avg[age_target_avg["target"]==1]
age_target_avg = age_target_avg.total_age_category.mean()
f'Nilai rata-rata dari pengidap serangan jantung di setiap usia {round(age_target_avg,0)}'
/tmp/ipykernel_3504/3424322441.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  age_target_avg = df.groupby(["age_group","target"])["target"].count().reset_index(name='total_age_category')
Out[7]:
'Nilai rata-rata dari pengidap serangan jantung di setiap usia 27.0'
In [8]:
import plotly.graph_objects as go

fig = go.Figure()

# Add histogram trace for "Heart Disease Absent" (target = 0) in grey
fig.add_trace(go.Histogram(
    x=df[df['target'] == 0]['age'],
    marker=dict(color='grey'),
    name='No Heart Disease',
    xbins=dict(size=10)  
))

# Add histogram trace for "Heart Disease Present" (target = 1) in red
fig.add_trace(go.Histogram(
    x=df[df['target'] == 1]['age'],
    marker=dict(color='#C62E2E'),
    name='Heart Disease',
    xbins=dict(size=10)  
))

# Add dashed line for average count
fig.add_shape(
    type="line",
    x0=0, x1=1, y0=27, y1=27,  
    xref="paper", yref="y",
    line=dict(color="#4A4947", width=2, dash="dash")
)

# Update layout
fig.update_layout(
    title=dict(
        text="Heart Disease Count by Age",
        font=dict(size=20, color="black", family="Arial", weight="bold"),
        x=0.08,
        xanchor="left"
    ),
    xaxis=dict(
        title="Age",
        title_font=dict(size=14, weight="bold"),
        tickmode="linear",
        dtick=10,
        showline=True,
        linecolor="black",
        linewidth=2
    ),
    yaxis=dict(
        title=None,
        showline=True,
        showticklabels=False,
        linewidth=2,
    
        
        
    ),
    plot_bgcolor="white",
    bargap=0.2,  
    barmode='group' 
)

# Add hover template for custom text on hover
fig.update_traces(
    hovertemplate='Age Range: %{x}<br>Count: %{y}',
)
# Adjust annotation with flexible positioning
fig.add_annotation(
    text="The age range of <b>40 to 69 is the most likely<br>age for an individual to experience a heart <br>attack</b>, as the number of heart attack cases<br>in this age group is above the average of<br>other age groups.",
    xref='paper',
    yref='paper',
    x=0.335, 
    y=1.1,  
    showarrow=False,
    xanchor="right",
    font=dict(size=12, color='black'),
    align='left'
)

# Adjust annotation with flexible positioning
fig.add_annotation(
    text=f'<b>Heart Disease avg (27)</b> ', 
    xref='paper',
    yref='paper',
    x=1.03,  
    y=0.43,  
    showarrow=False,
    xanchor="right",
    font=dict(size=12, color='#4A4947'),
    align='left'
)

# Show plot
fig.show()
In [9]:
pyramid_data = df.groupby(['age_group', 'sex']).size().unstack(fill_value=0)
pyramid_data.columns = ['Female', 'Male']
pyramid_data
/tmp/ipykernel_3504/985610881.py:1: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Out[9]:
Female Male
age_group
20-29 0 1
30-39 5 9
40-49 19 53
50-59 34 91
60-69 33 47
70-79 5 5
In [10]:
# Hitung jumlah total berdasarkan jenis kelamin dan kelompok usia
pyramid_data = df.groupby(['age_group', 'sex']).size().unstack(fill_value=0)
pyramid_data.columns = ['Female', 'Male']

# Membuat nilai female menjadi negatif untuk visualisasi piramida
pyramid_data['Female'] = -pyramid_data['Female']
total_females = pyramid_data['Female'].abs().sum()
total_males = pyramid_data['Male'].sum()

# Membuat figure
fig = go.Figure()

# Tambahkan bar untuk female (sisi kiri, nilai negatif)
fig.add_trace(go.Bar(
    y=pyramid_data.index,
    x=pyramid_data['Female'],
    name='Female',
    orientation='h',
    marker=dict(color='salmon'),
    customdata=[abs(val) for val in pyramid_data['Female']], 
    hovertemplate='Age Range: %{y}<br>Count: %{customdata}' 
))

# Tambahkan bar untuk male (sisi kanan)
fig.add_trace(go.Bar(
    y=pyramid_data.index,
    x=pyramid_data['Male'],
    name='Male',
    orientation='h',
    marker=dict(color='teal'),
    hovertemplate='Age Range: %{y}<br>Count: %{x}'  
))

# Pengaturan layout
fig.update_layout(
    title='Number of Sample by Age and Sex',
    xaxis=dict(
        title=None,
        showline=True,
        showticklabels=False,
    ),
    yaxis=dict(
        title='Age Group',
        categoryorder='category ascending'
    ),
    barmode='relative',
    bargap=0.12,
    plot_bgcolor='white',
    showlegend=True
)

# Tambahkan anotasi dengan posisi fleksibel
fig.add_annotation(
    text=f'Total sample <b>male domination: {(round(total_males / (total_males + total_females), 2)) * 100}%</b> '
         f'with females: {total_females} and males: {total_males}',    
    xref='paper',
    yref='paper',
    x=0.45, 
    y=1.1,
    showarrow=False,
    xanchor="right",
    font=dict(size=12, color='black'),
    align='left'
)

fig.show()
/tmp/ipykernel_3504/3835416259.py:2: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

In [11]:
gender = df[["sex","target","age_group"]]
gender.head()
Out[11]:
sex target age_group
0 1 0 50-59
1 1 0 50-59
2 1 0 70-79
3 1 0 60-69
4 0 0 60-69
In [12]:
gender = df[["sex", "target", "age_group"]]
one_hot_encoded_data = pd.get_dummies(gender, columns=['target', 'sex'])
one_hot_encoded_data.groupby(["age_group"])["target_0"].count().reset_index()
/tmp/ipykernel_3504/904798733.py:3: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Out[12]:
age_group target_0
0 20-29 1
1 30-39 14
2 40-49 72
3 50-59 125
4 60-69 80
5 70-79 10
In [13]:
one_hot_encoded_data.columns
Out[13]:
Index(['age_group', 'target_0', 'target_1', 'sex_0', 'sex_1'], dtype='object')
In [14]:
sex_mapping = {
     0 : 'Female',
     1 : 'Male'
}
# Create the cp_category column based on the cp column
df['sex_category'] = df['sex'].map(sex_mapping)
df.head()
Out[14]:
index age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target age_group sex_category
0 0 52 1 0 125 212.0 0 1 168 0 1.0 2 2 3 0 50-59 Male
1 1 53 1 0 140 203.0 1 0 155 1 3.1 0 0 3 0 50-59 Male
2 2 70 1 0 145 174.0 0 1 125 1 2.6 0 0 3 0 70-79 Male
3 3 61 1 0 148 203.0 0 1 161 0 0.0 2 1 3 0 60-69 Male
4 4 62 0 0 138 294.0 1 1 106 0 1.9 1 3 2 0 60-69 Female
In [15]:
sex_percentage = df.groupby(["sex_category","target"])["target"].count().reset_index(name='total_target')
# Calculate the total for each cp_category
total_counts = sex_percentage.groupby('sex_category')['total_target'].transform('sum')

# Calculate the percentage for each target within each cp_category
sex_percentage['percentage'] = (round(sex_percentage['total_target'] / total_counts,2)) * 100
sex_percentage
Out[15]:
sex_category target total_target percentage
0 Female 0 24 25.0
1 Female 1 72 75.0
2 Male 0 114 55.0
3 Male 1 92 45.0
In [16]:
import plotly.graph_objects as go


# Define colors
color_target_0 = '#c5c5c5'   
color_target_1 = '#ff6b6b'   
highlight_color = '#ff0000'    

# Filter data for each target
no_heart_disease = sex_percentage[sex_percentage['target'] == 0]
heart_disease = sex_percentage[sex_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'sex_category']
else:
    max_category = None  

# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['sex_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['sex_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['sex_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))

# Update layout
fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Gender",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',  
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",         
        yanchor="bottom",
        y=-0.3,                   
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [17]:
cp_mapping = {
     0 : 'Typical angina',
     1 : 'Atypical angina',
     2 : 'Non-anginal pain',
     3 : 'Asymptomatic'
}
# Create the cp_category column based on the cp column
df['cp_category'] = df['cp'].map(cp_mapping)
df
Out[17]:
index age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target age_group sex_category cp_category
0 0 52 1 0 125 212.0 0 1 168 0 1.0 2 2 3 0 50-59 Male Typical angina
1 1 53 1 0 140 203.0 1 0 155 1 3.1 0 0 3 0 50-59 Male Typical angina
2 2 70 1 0 145 174.0 0 1 125 1 2.6 0 0 3 0 70-79 Male Typical angina
3 3 61 1 0 148 203.0 0 1 161 0 0.0 2 1 3 0 60-69 Male Typical angina
4 4 62 0 0 138 294.0 1 1 106 0 1.9 1 3 2 0 60-69 Female Typical angina
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 723 68 0 2 120 211.0 0 0 115 0 1.5 1 0 2 1 60-69 Female Non-anginal pain
298 733 44 0 2 108 141.0 0 1 175 0 0.6 1 0 2 1 40-49 Female Non-anginal pain
299 739 52 1 0 128 255.0 0 1 161 1 0.0 2 1 3 0 50-59 Male Typical angina
300 843 59 1 3 160 273.0 0 0 125 0 0.0 2 0 2 0 50-59 Male Asymptomatic
301 878 54 1 0 120 188.0 0 1 113 0 1.4 1 1 3 0 50-59 Male Typical angina

302 rows × 18 columns

In [18]:
df["cp_category"].unique()
Out[18]:
array(['Typical angina', 'Atypical angina', 'Non-anginal pain',
       'Asymptomatic'], dtype=object)
In [19]:
cp_percentage = df.groupby(["cp_category","target"])["target"].count().reset_index(name='total_target')
In [20]:
# Calculate the total for each cp_category
total_counts = cp_percentage.groupby('cp_category')['total_target'].transform('sum')

# Calculate the percentage for each target within each cp_category
cp_percentage['percentage'] = (round(cp_percentage['total_target'] / total_counts,2)) * 100
cp_percentage
Out[20]:
cp_category target total_target percentage
0 Asymptomatic 0 7 30.0
1 Asymptomatic 1 16 70.0
2 Atypical angina 0 9 18.0
3 Atypical angina 1 41 82.0
4 Non-anginal pain 0 18 21.0
5 Non-anginal pain 1 68 79.0
6 Typical angina 0 104 73.0
7 Typical angina 1 39 27.0
In [21]:
import plotly.graph_objects as go


# Define colors
color_target_0 = '#c5c5c5'   
color_target_1 = '#ff6b6b'   
highlight_color = '#ff0000'    

# Filter data for each target
no_heart_disease = cp_percentage[cp_percentage['target'] == 0]
heart_disease = cp_percentage[cp_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'cp_category']
else:
    max_category = None  

# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['cp_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['cp_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['cp_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))

# Update layout
fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Chest Pain Type",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',  
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",         
        yanchor="bottom",
        y=-0.3,                   
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [22]:
df["fbs"].unique()
Out[22]:
array([0, 1])
In [23]:
cp_mapping = {
     0 : '<= 120 mg/dl',
     1 : '> 120 mg/dl',}
# Create the cp_category column based on the cp column
df['fbs_category'] = df['fbs'].map(cp_mapping)
df
Out[23]:
index age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target age_group sex_category cp_category fbs_category
0 0 52 1 0 125 212.0 0 1 168 0 1.0 2 2 3 0 50-59 Male Typical angina <= 120 mg/dl
1 1 53 1 0 140 203.0 1 0 155 1 3.1 0 0 3 0 50-59 Male Typical angina > 120 mg/dl
2 2 70 1 0 145 174.0 0 1 125 1 2.6 0 0 3 0 70-79 Male Typical angina <= 120 mg/dl
3 3 61 1 0 148 203.0 0 1 161 0 0.0 2 1 3 0 60-69 Male Typical angina <= 120 mg/dl
4 4 62 0 0 138 294.0 1 1 106 0 1.9 1 3 2 0 60-69 Female Typical angina > 120 mg/dl
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 723 68 0 2 120 211.0 0 0 115 0 1.5 1 0 2 1 60-69 Female Non-anginal pain <= 120 mg/dl
298 733 44 0 2 108 141.0 0 1 175 0 0.6 1 0 2 1 40-49 Female Non-anginal pain <= 120 mg/dl
299 739 52 1 0 128 255.0 0 1 161 1 0.0 2 1 3 0 50-59 Male Typical angina <= 120 mg/dl
300 843 59 1 3 160 273.0 0 0 125 0 0.0 2 0 2 0 50-59 Male Asymptomatic <= 120 mg/dl
301 878 54 1 0 120 188.0 0 1 113 0 1.4 1 1 3 0 50-59 Male Typical angina <= 120 mg/dl

302 rows × 19 columns

In [24]:
fbs_percentage = df.groupby(["fbs_category","target"])["target"].count().reset_index(name='total_target')
# Calculate the total for each cp_category
total_counts = fbs_percentage.groupby('fbs_category')['total_target'].transform('sum')

# Calculate the percentage for each target within each cp_category
fbs_percentage['percentage'] = (round(fbs_percentage['total_target'] / total_counts,2)) * 100
fbs_percentage
Out[24]:
fbs_category target total_target percentage
0 <= 120 mg/dl 0 116 45.0
1 <= 120 mg/dl 1 141 55.0
2 > 120 mg/dl 0 22 49.0
3 > 120 mg/dl 1 23 51.0
In [25]:
import plotly.graph_objects as go

# Define colors
color_target_0 = '#c5c5c5'   
color_target_1 = '#ff6b6b'   
highlight_color = '#ff0000'    

# Filter data for each target
no_heart_disease = fbs_percentage[fbs_percentage['target'] == 0]
heart_disease = fbs_percentage[fbs_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'fbs_category']
else:
    max_category = None 
    
# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['fbs_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['fbs_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['fbs_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))

# Update layout
fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Fasting Blood Sugar Level",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack', 
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",       
        yanchor="bottom",
        y=-0.3,             
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [26]:
cp_mapping = {
     0 : 'Normal',
     1 : 'Having ST-T wave abnormality',
     2 : 'Left ventricular hypertrophy'
}
# Create the cp_category column based on the cp column
df['restecg_category'] = df['restecg'].map(cp_mapping)
df
Out[26]:
index age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target age_group sex_category cp_category fbs_category restecg_category
0 0 52 1 0 125 212.0 0 1 168 0 1.0 2 2 3 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality
1 1 53 1 0 140 203.0 1 0 155 1 3.1 0 0 3 0 50-59 Male Typical angina > 120 mg/dl Normal
2 2 70 1 0 145 174.0 0 1 125 1 2.6 0 0 3 0 70-79 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality
3 3 61 1 0 148 203.0 0 1 161 0 0.0 2 1 3 0 60-69 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality
4 4 62 0 0 138 294.0 1 1 106 0 1.9 1 3 2 0 60-69 Female Typical angina > 120 mg/dl Having ST-T wave abnormality
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 723 68 0 2 120 211.0 0 0 115 0 1.5 1 0 2 1 60-69 Female Non-anginal pain <= 120 mg/dl Normal
298 733 44 0 2 108 141.0 0 1 175 0 0.6 1 0 2 1 40-49 Female Non-anginal pain <= 120 mg/dl Having ST-T wave abnormality
299 739 52 1 0 128 255.0 0 1 161 1 0.0 2 1 3 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality
300 843 59 1 3 160 273.0 0 0 125 0 0.0 2 0 2 0 50-59 Male Asymptomatic <= 120 mg/dl Normal
301 878 54 1 0 120 188.0 0 1 113 0 1.4 1 1 3 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality

302 rows × 20 columns

In [27]:
restecg_percentage = df.groupby(["restecg_category","target"])["target"].count().reset_index(name='total_target')
# Calculate the total for each cp_category
total_counts = restecg_percentage.groupby('restecg_category')['total_target'].transform('sum')

# Calculate the percentage for each target within each cp_category
restecg_percentage['percentage'] = (round(restecg_percentage['total_target'] / total_counts,2)) * 100
restecg_percentage
Out[27]:
restecg_category target total_target percentage
0 Having ST-T wave abnormality 0 56 37.0
1 Having ST-T wave abnormality 1 95 63.0
2 Left ventricular hypertrophy 0 3 75.0
3 Left ventricular hypertrophy 1 1 25.0
4 Normal 0 79 54.0
5 Normal 1 68 46.0
In [28]:
import plotly.graph_objects as go

# Define colors
color_target_0 = '#c5c5c5'    
color_target_1 = '#ff6b6b'    
highlight_color = '#ff0000'   

# Filter data for each target
no_heart_disease = restecg_percentage[restecg_percentage['target'] == 0]
heart_disease = restecg_percentage[restecg_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'restecg_category']
else:
    max_category = None  

# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['restecg_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

# Create the figure
fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['restecg_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['restecg_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))

# Update layout
fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Resting Electrocardiographic Results",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',  
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",       
        yanchor="bottom",
        y=-0.3,                 
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

# Show plot
fig.show()
In [29]:
exang_mapping = {
     0 : 'No',
     1 : 'Yes',
}
slope_mapping = {
     0 : 'Upsloping',
     1 : 'Flat',
     2: 'Downsloping'
}
ca_mapping = {
    0: 'No vessels colored',
    1: '1 major vessel colored',
    2: '2 major vessels colored',
    3: '3 major vessels colored'
}
thal_mapping = {
     1 : 'Normal',
     2: 'Fixed defect',
     3: 'Reversible defect'
}

# Create the cp_category column based on the cp column
df['exang_category'] = df['exang'].map(exang_mapping)
df['slope_category'] = df['slope'].map(slope_mapping)
df['ca_category'] = df['ca'].map(ca_mapping)
df['thal_category'] = df['thal'].map(thal_mapping)
df
Out[29]:
index age sex cp trestbps chol fbs restecg thalach exang ... target age_group sex_category cp_category fbs_category restecg_category exang_category slope_category ca_category thal_category
0 0 52 1 0 125 212.0 0 1 168 0 ... 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality No Downsloping 2 major vessels colored Reversible defect
1 1 53 1 0 140 203.0 1 0 155 1 ... 0 50-59 Male Typical angina > 120 mg/dl Normal Yes Upsloping No vessels colored Reversible defect
2 2 70 1 0 145 174.0 0 1 125 1 ... 0 70-79 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality Yes Upsloping No vessels colored Reversible defect
3 3 61 1 0 148 203.0 0 1 161 0 ... 0 60-69 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality No Downsloping 1 major vessel colored Reversible defect
4 4 62 0 0 138 294.0 1 1 106 0 ... 0 60-69 Female Typical angina > 120 mg/dl Having ST-T wave abnormality No Flat 3 major vessels colored Fixed defect
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 723 68 0 2 120 211.0 0 0 115 0 ... 1 60-69 Female Non-anginal pain <= 120 mg/dl Normal No Flat No vessels colored Fixed defect
298 733 44 0 2 108 141.0 0 1 175 0 ... 1 40-49 Female Non-anginal pain <= 120 mg/dl Having ST-T wave abnormality No Flat No vessels colored Fixed defect
299 739 52 1 0 128 255.0 0 1 161 1 ... 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality Yes Downsloping 1 major vessel colored Reversible defect
300 843 59 1 3 160 273.0 0 0 125 0 ... 0 50-59 Male Asymptomatic <= 120 mg/dl Normal No Downsloping No vessels colored Fixed defect
301 878 54 1 0 120 188.0 0 1 113 0 ... 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality No Flat 1 major vessel colored Reversible defect

302 rows × 24 columns

In [30]:
#exang_category
exang_percentage = df.groupby(["exang_category","target"])["target"].count().reset_index(name='total_target')
total_counts = exang_percentage.groupby('exang_category')['total_target'].transform('sum')
exang_percentage['percentage'] = (round(exang_percentage['total_target'] / total_counts,2)) * 100

#slope_category
slope_percentage = df.groupby(["slope_category","target"])["target"].count().reset_index(name='total_target')
total_counts = slope_percentage.groupby('slope_category')['total_target'].transform('sum')
slope_percentage['percentage'] = (round(slope_percentage['total_target'] / total_counts,2)) * 100

#ca_category
ca_percentage = df.groupby(["ca_category","target"])["target"].count().reset_index(name='total_target')
total_counts = ca_percentage.groupby('ca_category')['total_target'].transform('sum')
ca_percentage['percentage'] = (round(ca_percentage['total_target'] / total_counts,2)) * 100

#thal_category
thal_percentage = df.groupby(["thal_category","target"])["target"].count().reset_index(name='total_target')
total_counts = thal_percentage.groupby('thal_category')['total_target'].transform('sum')
thal_percentage['percentage'] = (round(thal_percentage['total_target'] / total_counts,2)) * 100
In [31]:
import plotly.graph_objects as go

# Define colors
color_target_0 = '#c5c5c5'    
color_target_1 = '#ff6b6b'    
highlight_color = '#ff0000'   

# Filter data for each target
no_heart_disease = exang_percentage[exang_percentage['target'] == 0]
heart_disease = exang_percentage[exang_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'exang_category']
else:
    max_category = None

# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['exang_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['exang_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['exang_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))

fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Exercise-Induced Angina ",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.3,
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [32]:
import plotly.graph_objects as go

# Define colors
color_target_0 = '#c5c5c5'    
color_target_1 = '#ff6b6b'    
highlight_color = '#ff0000'   

# Filter data for each target
no_heart_disease = slope_percentage[slope_percentage['target'] == 0]
heart_disease = slope_percentage[slope_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'slope_category']
else:
    max_category = None  

# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['slope_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['slope_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['slope_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))

fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by The Slope of The Peak Exercise ST Segment",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',  
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",          
        yanchor="bottom",
        y=-0.3,                   
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [33]:
import plotly.graph_objects as go

# Define colors
color_target_0 = '#c5c5c5'
color_target_1 = '#ff6b6b'
highlight_color = '#ff0000'

# Filter data for each target
no_heart_disease = ca_percentage[ca_percentage['target'] == 0]
heart_disease = ca_percentage[ca_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'ca_category']
else:
    max_category = None 

heart_disease_colors = heart_disease['ca_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['ca_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['ca_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))


fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Major Vessels Colored by Fluoroscopy",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.3,
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [34]:
ca_percentage
Out[34]:
ca_category target total_target percentage
0 1 major vessel colored 0 44 68.0
1 1 major vessel colored 1 21 32.0
2 2 major vessels colored 0 31 82.0
3 2 major vessels colored 1 7 18.0
4 3 major vessels colored 0 17 85.0
5 3 major vessels colored 1 3 15.0
6 No vessels colored 0 45 26.0
7 No vessels colored 1 130 74.0
In [35]:
import plotly.graph_objects as go


# Define colors
color_target_0 = '#c5c5c5'    
color_target_1 = '#ff6b6b'    
highlight_color = '#ff0000' 

no_heart_disease = thal_percentage[thal_percentage['target'] == 0]
heart_disease = thal_percentage[thal_percentage['target'] == 1]

# Identify the category with the highest heart disease percentage
if not heart_disease.empty:
    max_index = heart_disease['percentage'].idxmax()
    max_category = heart_disease.loc[max_index, 'thal_category']
else:
    max_category = None  

# Assign colors for 'Heart Disease' bars, highlighting the max category
heart_disease_colors = heart_disease['thal_category'].apply(
    lambda x: highlight_color if x == max_category else color_target_1
).tolist()

fig = go.Figure()

# Add 'No Heart Disease' bars
fig.add_trace(go.Bar(
    y=no_heart_disease['thal_category'],
    x=no_heart_disease['percentage'],
    name='No Heart Disease',
    orientation='h',
    marker=dict(color=color_target_0),
    text=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=no_heart_disease['percentage'].apply(lambda x: f"{int(x)}% No Heart Disease")
))

# Add 'Heart Disease' bars with conditional coloring
fig.add_trace(go.Bar(
    y=heart_disease['thal_category'],
    x=heart_disease['percentage'],
    name='Heart Disease',
    orientation='h',
    marker=dict(color=heart_disease_colors),
    text=heart_disease['percentage'].apply(lambda x: f"{int(x)}%"),
    textposition='inside',
    textfont=dict(size=15),
    hoverinfo='text',
    hovertext=heart_disease['percentage'].apply(lambda x: f"{int(x)}% Heart Disease")
))


fig.update_layout(
    title=dict(
        text="Distribution of Heart Disease by Thalassemia",
        font=dict(size=24, color="black", family="Arial", weight="bold"),
        x=0,
        xanchor="left"
    ),
    xaxis=dict(
        title=None,
        tickvals=[0, 20, 40, 60, 80, 100],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        range=[0, 100],
        tickfont=dict(size=15)
    ),
    yaxis=dict(
        title=None,
        tickfont=dict(size=15)
    ),
    barmode='stack',  
    plot_bgcolor="white",
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.3,
        xanchor="center",
        x=0.5,
        traceorder="normal",
        font=dict(size=15),
        itemclick="toggleothers",
        
    )
)

fig.show()
In [36]:
df
Out[36]:
index age sex cp trestbps chol fbs restecg thalach exang ... target age_group sex_category cp_category fbs_category restecg_category exang_category slope_category ca_category thal_category
0 0 52 1 0 125 212.0 0 1 168 0 ... 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality No Downsloping 2 major vessels colored Reversible defect
1 1 53 1 0 140 203.0 1 0 155 1 ... 0 50-59 Male Typical angina > 120 mg/dl Normal Yes Upsloping No vessels colored Reversible defect
2 2 70 1 0 145 174.0 0 1 125 1 ... 0 70-79 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality Yes Upsloping No vessels colored Reversible defect
3 3 61 1 0 148 203.0 0 1 161 0 ... 0 60-69 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality No Downsloping 1 major vessel colored Reversible defect
4 4 62 0 0 138 294.0 1 1 106 0 ... 0 60-69 Female Typical angina > 120 mg/dl Having ST-T wave abnormality No Flat 3 major vessels colored Fixed defect
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 723 68 0 2 120 211.0 0 0 115 0 ... 1 60-69 Female Non-anginal pain <= 120 mg/dl Normal No Flat No vessels colored Fixed defect
298 733 44 0 2 108 141.0 0 1 175 0 ... 1 40-49 Female Non-anginal pain <= 120 mg/dl Having ST-T wave abnormality No Flat No vessels colored Fixed defect
299 739 52 1 0 128 255.0 0 1 161 1 ... 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality Yes Downsloping 1 major vessel colored Reversible defect
300 843 59 1 3 160 273.0 0 0 125 0 ... 0 50-59 Male Asymptomatic <= 120 mg/dl Normal No Downsloping No vessels colored Fixed defect
301 878 54 1 0 120 188.0 0 1 113 0 ... 0 50-59 Male Typical angina <= 120 mg/dl Having ST-T wave abnormality No Flat 1 major vessel colored Reversible defect

302 rows × 24 columns

In [ ]:
 

Clustering Numeric¶

In [37]:
data = df[['trestbps', 'chol',
       'thalach']]

corrmat= data.corr()
plt.figure(figsize=(10,5))  
sns.heatmap(corrmat, annot=True, cmap='RdYlGn', center=0)
Out[37]:
<Axes: >
In [38]:
data_with_intercept = sm.add_constant(data)

vif_data = pd.DataFrame()
vif_data["Variable"] = data_with_intercept.columns
vif_data["VIF"] = [variance_inflation_factor(data_with_intercept.values, i)
                   for i in range(data_with_intercept.shape[1])]

plt.figure(figsize=(10, 6))
sns.barplot(x='VIF', y='Variable', data=vif_data, palette='viridis')
plt.title('Variance Inflation Factor (VIF) for Each Variable', fontsize=15)
plt.xlabel('VIF', fontsize=12)
plt.ylabel('Variable', fontsize=12)

plt.show()
/tmp/ipykernel_3504/3758902526.py:9: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


In [39]:
Elbow_M = KElbowVisualizer(KMeans(), k=10)
labels = Elbow_M.fit(data)

labels.show()
Out[39]:
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [40]:
features = ['trestbps', 'thalach', 'chol']
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)
In [41]:
cluster_range = range(2, 11)
silhouette_scores = []
davies_bouldin_scores = []

for n_clusters in cluster_range:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=111
    )
    labels = kmeans.fit_predict(X_scaled)
    
    # Menghitung Silhouette Score
    silhouette_avg = silhouette_score(X_scaled, labels)
    silhouette_scores.append(silhouette_avg)
    
    # Menghitung Davies-Bouldin Index
    davies_bouldin_avg = davies_bouldin_score(X_scaled, labels)
    davies_bouldin_scores.append(davies_bouldin_avg)
# Membuat DataFrame untuk menyimpan hasil
results_df = pd.DataFrame({
    'Number of Clusters': cluster_range,
    'Silhouette Score': silhouette_scores,
    'Davies-Bouldin Index': davies_bouldin_scores
})

results_df
results_df.sort_values(by="Silhouette Score", ascending=False).style.background_gradient(cmap='Greys')
Out[41]:
  Number of Clusters Silhouette Score Davies-Bouldin Index
2 4 0.267604 1.136922
4 6 0.264203 1.100376
0 2 0.262386 1.583724
1 3 0.254036 1.289038
6 8 0.251404 1.088724
3 5 0.249821 1.102119
7 9 0.246821 1.099206
8 10 0.239248 1.100932
5 7 0.235945 1.112547
In [42]:
# Visualisasi Silhouette Score dan Davies-Bouldin Index
plt.figure(figsize=(14, 7))

# Silhouette Score
plt.subplot(1, 2, 1)
plt.plot(cluster_range, silhouette_scores, marker='o', linestyle='-', color='b')
plt.title('Silhouette Score vs Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(cluster_range)
plt.grid(True)

# Davies-Bouldin Index
plt.subplot(1, 2, 2)
plt.plot(cluster_range, davies_bouldin_scores, marker='o', linestyle='-', color='r')
plt.title('Davies-Bouldin Index vs Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Davies-Bouldin Index')
plt.xticks(cluster_range)
plt.grid(True)

plt.tight_layout()
plt.show()
In [43]:
# 1. Preprocessing Data (Penskalaan)
features = ['trestbps', 'thalach', 'chol']
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=4, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
centers_df = pd.DataFrame(centers, columns=features)
centers_df['cluster'] = range(4) 

fig = px.scatter_3d(
    df,
    x='trestbps',
    y='thalach',
    z='chol',
    color=df['cluster'].astype(str),  
    symbol=df['target'].astype(str),  
    color_discrete_sequence=['green', 'orange', 'blue', 'red'],  
    size_max=10,
    hover_data={
        'trestbps': True,
        'thalach': True,
        'chol': True,
        'target': True,
        'cluster': True
    },
    labels={
        'trestbps': 'Tekanan Darah Istirahat (trestbps)',
        'thalach': 'Maksimum Detak Jantung (thalach)',
        'chol': 'Kolesterol (chol)',
        'target': 'Target',
        'cluster': 'Cluster'
    },
    title='3D Scatter Plot dengan K-Means Clustering (4 Cluster)',
    width=800,
    height=1000
)

# Menambahkan titik pusat cluster
fig.add_trace(
    px.scatter_3d(
        centers_df,
        x='trestbps',
        y='thalach',
        z='chol',
        color='cluster',
        color_discrete_sequence=['black']*len(centers_df),
        symbol='cluster',
        size=[12]*len(centers_df),
        opacity=1,
        hover_data={
            'trestbps': True,
            'thalach': True,
            'chol': True,
            'cluster': True
        },
        labels={
            'trestbps': 'Tekanan Darah Istirahat (trestbps)',
            'thalach': 'Maksimum Detak Jantung (thalach)',
            'chol': 'Kolesterol (chol)',
            'cluster': 'Cluster'
        }
    ).data[0] 
)

# Menyesuaikan layout untuk kejelasan
fig.update_layout(
    scene=dict(
        xaxis=dict(title='Tekanan Darah Istirahat (trestbps)', showticklabels=False, backgroundcolor="rgb(230, 230, 230)"),
        yaxis=dict(title='Maksimum Detak Jantung (thalach)', showticklabels=False, backgroundcolor="rgb(230, 230, 230)"),
        zaxis=dict(title='Kolesterol (chol)', showticklabels=False, backgroundcolor="rgb(230, 230, 230)"),
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)  
        )
    ),
    legend=dict(
        title='Keterangan',
        x=0.85,
        y=0.95
    ),
    title=dict(
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top'
    )
)

fig.update_traces(marker=dict(size=5, opacity=0.8))


fig.show()
In [44]:
import plotly.express as px

# Visualisasi 2D Scatter Plot: trestbps vs thalach
fig1 = px.scatter(
    df,
    x='trestbps',
    y='thalach',
    color=df['cluster'].astype(str),
    symbol=df['target'].astype(str),
    color_discrete_sequence=['green', 'orange', 'blue', 'red'],
    hover_data={
        'trestbps': True,
        'thalach': True,
        'chol': True,
        'target': True,
        'cluster': True
    },
    labels={
        'trestbps': 'Tekanan Darah Istirahat (trestbps)',
        'thalach': 'Maksimum Detak Jantung (thalach)',
        'target': 'Target',
        'cluster': 'Cluster'
    },
    title='2D Scatter Plot: trestbps vs thalach'
)
fig1.show()

# Visualisasi 2D Scatter Plot: trestbps vs chol
fig2 = px.scatter(
    df,
    x='trestbps',
    y='chol',
    color=df['cluster'].astype(str),
    symbol=df['target'].astype(str),
    color_discrete_sequence=['green', 'orange', 'blue', 'red'],
    hover_data={
        'trestbps': True,
        'thalach': True,
        'chol': True,
        'target': True,
        'cluster': True
    },
    labels={
        'trestbps': 'Tekanan Darah Istirahat (trestbps)',
        'chol': 'Kolesterol (chol)',
        'target': 'Target',
        'cluster': 'Cluster'
    },
    title='2D Scatter Plot: trestbps vs chol'
)
fig2.show()

# Visualisasi 2D Scatter Plot: thalach vs chol
fig3 = px.scatter(
    df,
    x='thalach',
    y='chol',
    color=df['cluster'].astype(str),
    symbol=df['target'].astype(str),
    color_discrete_sequence=['green', 'orange', 'blue', 'red'],
    hover_data={
        'trestbps': True,
        'thalach': True,
        'chol': True,
        'target': True,
        'cluster': True
    },
    labels={
        'thalach': 'Maksimum Detak Jantung (thalach)',
        'chol': 'Kolesterol (chol)',
        'target': 'Target',
        'cluster': 'Cluster'
    },
    title='2D Scatter Plot: thalach vs chol'
)
fig3.show()
In [45]:
# Membuat Statistik Deskriptif per Cluster
cluster_summary = df.groupby('cluster').describe()


# Menampilkan secara ringkas hanya statistik yang relevan (misalnya mean dan std)
cluster_summary_mean_std = df.groupby('cluster')[['trestbps', 'thalach', 'chol']].agg(['min', 'max','median',"std"])
cluster_summary_mean_std
Out[45]:
trestbps thalach chol
min max median std min max median std min max median std
cluster
0 94 150 125.0 12.556069 142 202 165.0 13.170838 126.0 271.0 223.0 27.786386
1 100 164 126.0 13.633521 71 143 122.0 14.935229 131.0 294.0 216.0 39.440937
2 100 160 130.0 10.601027 109 182 153.0 15.249750 253.0 417.0 303.0 37.112065
3 140 200 158.0 13.287866 108 195 150.0 18.367360 164.0 327.0 246.5 35.126418

Analisis Setiap Cluster Cluster 0:

Tekanan Darah Istirahat (trestbps): 94 - 150 Detak Jantung Maksimum (thalach): 142 - 202 Kolesterol (chol): 126 - 271 Karakteristik: Detak jantung maksimum yang tinggi (sampai 202) dan kolesterol yang rendah hingga sedang (126-271). Nama Usulan: "Cluster Detak Jantung Tinggi" Cluster 1:

Tekanan Darah Istirahat (trestbps): 100 - 164 Detak Jantung Maksimum (thalach): 71 - 143 Kolesterol (chol): 131 - 294 Karakteristik: Detak jantung maksimum yang lebih rendah (maksimum 143) dengan rentang kolesterol yang sedang (131-294). Nama Usulan: "Cluster Kolesterol Sedang dan Detak Jantung Rendah" Cluster 2:

Tekanan Darah Istirahat (trestbps): 100 - 160 Detak Jantung Maksimum (thalach): 109 - 182 Kolesterol (chol): 253 - 417 Karakteristik: Kolesterol tinggi (253-417) dengan detak jantung maksimum menengah. Nama Usulan: "Cluster Kolesterol Tinggi" Cluster 3:

Tekanan Darah Istirahat (trestbps): 140 - 200 Detak Jantung Maksimum (thalach): 108 - 195 Kolesterol (chol): 164 - 327 Karakteristik: Tekanan darah istirahat yang tinggi (140-200) dan rentang kolesterol yang lebih tinggi. Nama Usulan: "Cluster Tekanan Darah Tinggi"

In [46]:
# Menentukan nama cluster berdasarkan karakteristik statistik deskriptif
cluster_names = {
    0: 'Cluster Detak Jantung Tinggi',
    1: 'Cluster Kolesterol Sedang dan Detak Jantung Rendah',
    2: 'Cluster Kolesterol Tinggi',
    3: 'Cluster Tekanan Darah Tinggi'
}

# Menambahkan kolom baru dengan nama cluster
df['cluster_name'] = df['cluster'].map(cluster_names)
# Menampilkan hasil DataFrame dengan nama cluster
df[['trestbps', 'thalach', 'chol', 'cluster', 'cluster_name']]
Out[46]:
trestbps thalach chol cluster cluster_name
0 125 168 212.0 0 Cluster Detak Jantung Tinggi
1 140 155 203.0 0 Cluster Detak Jantung Tinggi
2 145 125 174.0 1 Cluster Kolesterol Sedang dan Detak Jantung Re...
3 148 161 203.0 3 Cluster Tekanan Darah Tinggi
4 138 106 294.0 1 Cluster Kolesterol Sedang dan Detak Jantung Re...
... ... ... ... ... ...
297 120 115 211.0 1 Cluster Kolesterol Sedang dan Detak Jantung Re...
298 108 175 141.0 0 Cluster Detak Jantung Tinggi
299 128 161 255.0 0 Cluster Detak Jantung Tinggi
300 160 125 273.0 3 Cluster Tekanan Darah Tinggi
301 120 113 188.0 1 Cluster Kolesterol Sedang dan Detak Jantung Re...

302 rows × 5 columns

In [47]:
df_numeric = df[["target","cluster_name"]]
df_numeric.head()
Out[47]:
target cluster_name
0 0 Cluster Detak Jantung Tinggi
1 0 Cluster Detak Jantung Tinggi
2 0 Cluster Kolesterol Sedang dan Detak Jantung Re...
3 0 Cluster Tekanan Darah Tinggi
4 0 Cluster Kolesterol Sedang dan Detak Jantung Re...
In [48]:
df_summary = df_numeric.groupby(["cluster_name","target"])["target"].count().reset_index(name='total_target')
df_summary
Out[48]:
cluster_name target total_target
0 Cluster Detak Jantung Tinggi 0 29
1 Cluster Detak Jantung Tinggi 1 88
2 Cluster Kolesterol Sedang dan Detak Jantung Re... 0 47
3 Cluster Kolesterol Sedang dan Detak Jantung Re... 1 20
4 Cluster Kolesterol Tinggi 0 36
5 Cluster Kolesterol Tinggi 1 34
6 Cluster Tekanan Darah Tinggi 0 26
7 Cluster Tekanan Darah Tinggi 1 22
In [49]:
# Pivot table untuk mendapatkan total per cluster dan target
pivot_df = df_summary.pivot(index='cluster_name', columns='target', values='total_target').reset_index()
pivot_df.columns = ['cluster_name', 'no_heart_disease', 'heart_disease']

# Menghitung total kasus di setiap cluster
pivot_df['total_cases'] = pivot_df['no_heart_disease'] + pivot_df['heart_disease']

# Menghitung peluang terkena serangan jantung di setiap cluster
pivot_df['heart_disease_probability (%)'] = round((pivot_df['heart_disease'] / pivot_df['total_cases']) * 100, 0).astype(int)


# Menampilkan hasil peluang serangan jantung per cluster
pivot_df_sorted = pivot_df[['cluster_name', 'heart_disease_probability (%)']].sort_values(by="heart_disease_probability (%)", ascending=False)
pivot_df_styled = pivot_df_sorted.style.background_gradient(cmap='Reds')
pivot_df_styled
Out[49]:
  cluster_name heart_disease_probability (%)
0 Cluster Detak Jantung Tinggi 75
2 Cluster Kolesterol Tinggi 49
3 Cluster Tekanan Darah Tinggi 46
1 Cluster Kolesterol Sedang dan Detak Jantung Rendah 30
In [50]:
df.columns
Out[50]:
Index(['index', 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
       'age_group', 'sex_category', 'cp_category', 'fbs_category',
       'restecg_category', 'exang_category', 'slope_category', 'ca_category',
       'thal_category', 'cluster', 'cluster_name'],
      dtype='object')
In [51]:
df["oldpeak"]
Out[51]:
0      1.0
1      3.1
2      2.6
3      0.0
4      1.9
      ... 
297    1.5
298    0.6
299    0.0
300    0.0
301    1.4
Name: oldpeak, Length: 302, dtype: float64
In [52]:
import plotly.graph_objects as go

fig = go.Figure()

# Add histogram trace for "No Heart Disease" (target = 0) in grey
fig.add_trace(go.Histogram(
    x=df[df['target'] == 0]['oldpeak'],
    marker=dict(color='grey'),
    name='No Heart Disease',
    xbins=dict(size=0.5)  # Sesuaikan ukuran bin jika perlu
))

# Add histogram trace for "Heart Disease" (target = 1) in red
fig.add_trace(go.Histogram(
    x=df[df['target'] == 1]['oldpeak'],
    marker=dict(color='#C62E2E'),
    name='Heart Disease',
    xbins=dict(size=0.5)  # Sesuaikan ukuran bin jika perlu
))

# Add dashed line for average count
fig.add_shape(
    type="line",
    x0=0, x1=1, y0=18, y1=18,  
    xref="paper", yref="y",
    line=dict(color="#4A4947", width=2, dash="dash")
)

# Update layout
fig.update_layout(
    title=dict(
        text="Heart Disease Count by Oldpeak",
        font=dict(size=20, color="black", family="Arial", weight="bold"),
        x=0.065,
        xanchor="left"
    ),
    xaxis=dict(
        title="Old Peak",
        title_font=dict(size=14, weight="bold"),
        tickmode="linear",
        dtick=0.5,
        showline=True,
        linecolor="black",
        linewidth=2
    ),
    yaxis=dict(
        title=None,
        showline=True,
        showticklabels=False,
        linewidth=2,
    ),
    plot_bgcolor="white",
    bargap=0.2,  
    barmode='group' 
)

# Add hover template for custom text on hover
fig.update_traces(
    hovertemplate='Old Peak Range: %{x}<br>Count: %{y}',
)

# Adjust annotation with flexible positioning
fig.add_annotation(
    text="The old peak level in certain ranges is more likely<br>to be associated with heart disease cases.<br>This histogram shows distribution across cases.",
    xref='paper',
    yref='paper',
    x=0.345, 
    y=1.1,  
    showarrow=False,
    xanchor="right",
    font=dict(size=12, color='black'),
    align='left'
)

# Adjust annotation with flexible positioning
fig.add_annotation(
    text=f'<b>Heart Disease avg (18)</b> ', 
    xref='paper',
    yref='paper',
    x=1.03,  
    y=0.23,  
    showarrow=False,
    xanchor="right",
    font=dict(size=12, color='#4A4947'),
    align='left'
)

# Show plot
fig.show()
In [53]:
def categorize_oldpeak(value):
    if -0.2 <= value <= 0.2:
        return '-0.2-0.2'
    elif 0.3 <= value <= 0.7:
        return '0.3-0.7'
    elif 0.8 <= value <= 1.2:
        return '0.8-1.2'
    elif 1.3 <= value <= 1.7:
        return '1.3-1.7'
    elif 1.8 <= value <= 2.2:
        return '1.8-2.2'
    elif 2.3 <= value <= 2.7:
        return '2.3-2.7'
    elif 2.8 <= value <= 3.2:
        return '2.8-3.2'
    elif 3.3 <= value <= 3.7:
        return '3.3-3.7'
    elif 3.8 <= value <= 4.2:
        return '3.8-4.2'
    elif 4.3 <= value <= 4.7:
        return '4.3-4.7'
    elif 4.8 <= value <= 5.2:
        return '4.8-5.2'
    elif 5.3 <= value <= 5.7:
        return '5.3-5.7'
    elif 5.8 <= value <= 6.0:
        return '5.8-6.0'
    else:
        return 'Out of Range'

# Terapkan fungsi untuk membuat kolom kategori
df['oldpeak_category'] = df['oldpeak'].apply(categorize_oldpeak)
df['oldpeak_category']
Out[53]:
0       0.8-1.2
1       2.8-3.2
2       2.3-2.7
3      -0.2-0.2
4       1.8-2.2
         ...   
297     1.3-1.7
298     0.3-0.7
299    -0.2-0.2
300    -0.2-0.2
301     1.3-1.7
Name: oldpeak_category, Length: 302, dtype: object
In [54]:
oldpeak_target_avg = df.groupby(["oldpeak_category","target"])["target"].count().reset_index(name='total_oldpeak_category')

oldpeak_target_avg = oldpeak_target_avg[oldpeak_target_avg["target"]==1]
oldpeak_target_avg = oldpeak_target_avg.total_oldpeak_category.mean()
f'Nilai rata-rata dari pengidap serangan jantung di setiap usia {round(oldpeak_target_avg,0)}'
Out[54]:
'Nilai rata-rata dari pengidap serangan jantung di setiap usia 18.0'
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: